* Reset the session before the script runs: empty the data in memory, switch
* off output paging, and close any log that is still open (capture suppresses
* the error raised when no log is open).
clear
set more off
* NOTE(review): as written this targets a macro literally named "all"; the
* form that drops every macro is "macro drop _all" -- confirm which was meant.
macro drop all
capture log close

/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection
Create Pre Variables

Created on: 2/26/2022
Last Modified on: 2/13/2024

Description: This do file creates baseline variables.

Note that we have removed the file directory names from this program for 
confidentiality reasons. 
********************************************************************************/

** Setting the Directory
* Root folders used to build the file paths below. They are intentionally left
* empty here (see the header note on confidentiality) and must be filled in
* before running; with cleandata empty, the paths built from it start at "/".
global rawdata 
global cleandata 
global tmp 

/********************************************************************************

For all children, these should be defined before the investigation date.

I will create the following PRE variables:

(1) SOCIO-DEMOGRAPHIC:
	-# Previous CPS Investigations
	-Age
	-Grade in School
	-Gender
	-Race
	-Free/reduced lunch

(2) ACADEMIC:
	-Attendance rate
	-Chronically absent
	-Special Ed
	-Test scores (G3-G8, ACT/SAT)
	-Test score proficiency (G3-G8, ACT/SAT)
	
(3) DISCIPLINE:
	-Ever expelled
	-Ever suspended

(4) SCHOOL ENVIRONMENT:
	-# Different Schools Attended
	-Urbanicity
	-Charter
	-School size
	-Racial Composition in School
	-% Frpl in School

(5) NEIGHBORHOOD ENVIRONMENT:
	-# Different Census Blocks Lived In
	-Median Income
	-Employment Rate
	-% with BA or Higher
	-Racial Composition of Neighborhood
	-Homeless
	
I will think of some of these measures as time invariant (race, gender, age at investigation)
and others as time varying (attendance rate, school size).  For the time varying measure,
I want to create variables for both the AVERAGE measure across all PRE observations and
also a variable for the LAST measure of all PRE observations (eg. track the average 
attendance rate of the child across all PRE obs and also track their attendance
rate in the year just before they were removed/investigated).
	
*******************************************************************************/

**Open the student-by-year panel on which every PRE measure is built
use "$cleandata/student_year_panel.dta", clear

***********************
***(1) SOCIO-DEMOGRAPHIC PRE MEASURES
***********************

*****NUMBER OF PREVIOUS CPS INVESTIGATIONS****
**Already present in the panel; only renamed to carry the pre_ prefix
rename n_prior_inv pre_n_prior_inv
label variable pre_n_prior_inv "# Prior CPS Investigations"

*****AGE*****
**The investigation date is a copy of the complaint date
generate cps_date = complaint_date
label variable cps_date "Date of Investigation"

**personage is a user-written command; here it creates cps_age from dob and cps_date
personage dob cps_date, gen(cps_age)
label variable cps_age "Age at Investigation"

**Recode out-of-range ages. Missing ages stay missing: a missing value is never
**below zero, so the first recode needs no explicit guard.
replace cps_age = 0 if cps_age < 0
**NOTE(review): ages above 18 are set to 17 while 18 itself is kept -- confirm the intended cap
replace cps_age = 17 if cps_age > 18 & cps_age != .

*****GRADE IN SCHOOL*****
**Impute missing values for grades based on expected grade progression
**Rows are ordered by child, investigation, and year so that [_n+1] is the
**following panel row.
sort ric inv_caseid year
**Fill a missing grade with (next row's grade - 1) when the next row belongs to
**the same child-investigation, is exactly one year later, and has a grade in 1-12.
replace grade_fnl=grade_fnl[_n+1]-1 if grade_fnl==. & ric==ric[_n+1] & inv_caseid==inv_caseid[_n+1] & year==year[_n+1]-1 & inrange(grade_fnl[_n+1],1,12)
**Carry the codes 30 and 14 backward onto earlier missing rows of the same
**child-investigation: a missing grade becomes 30 when the next row is 0 or 30,
**and 14 when the next row is 14. Each replace walks the rows top to bottom, so
**one pass only reaches the row directly above an already-filled row; the loop
**repeats the pass five times to reach up to five consecutive missing rows.
**NOTE(review): the meaning of grade codes 0, 14 and 30 is not shown in this
**file -- presumably pre-kindergarten/kindergarten/ungraded codes; confirm.
local i=1
while `i'<=5 {
	replace grade_fnl=30 if grade_fnl==. & ric==ric[_n+1] & inv_caseid==inv_caseid[_n+1] & (grade_fnl[_n+1]==0 | grade_fnl[_n+1]==30)
	replace grade_fnl=14 if grade_fnl==. & ric==ric[_n+1] & inv_caseid==inv_caseid[_n+1] & grade_fnl[_n+1]==14
	local ++i
}
*****note: the remaining obs with missing grades are where students show up in the data in 2003 and then not again for GK in 2011.
*****Just drop these weird obs.
**This removes every row whose grade is still missing after the imputation above.
drop if grade_fnl==.

*If the complaint/removal occurred between Jan-Aug, assign the student's grade in that
*year. If the complaint/removal occurred in Sep, if it was before the start of the school year,
*assign the grade in that year but if it was after the start of the school
*year, assign the grade in the next year. If the complaint occurred in Oct-Dec, assign
*the student's grade in the next year.
*
*School years are named by their spring year: startYYYY holds the first day of
*school for school year YYYY, which falls in September of calendar year YYYY-1.
local start2003=mdy(9,3,2002)
local start2004=mdy(9,2,2003)
local start2005=mdy(9,7,2004)
local start2006=mdy(9,6,2005)
local start2007=mdy(9,5,2006)
local start2008=mdy(9,4,2007)
local start2009=mdy(9,2,2008)
local start2010=mdy(9,8,2009)
local start2011=mdy(9,7,2010)
local start2012=mdy(9,6,2011)
local start2013=mdy(9,4,2012)
local start2014=mdy(9,3,2013)
local start2015=mdy(9,2,2014)
local start2016=mdy(9,8,2015)
local start2017=mdy(9,5,2016)

*startdate: first day of school for the school year of each panel ROW
gen startdate=.
forv y=2003/2017 {
	replace startdate=`start`y'' if year==`y'
}
la var startdate "Date of first day of school"

gen cps_month=month(cps_date)
la var cps_month "Month of Investigation"
rename complaint_year cps_year
la var cps_year "Year of Investigation"

*cps_startdate: first day of the school year that opens in September of the
*investigation's calendar year, i.e. school year cps_year+1. The September rule
*has to use this date, which is constant within an investigation. Comparing
*against startdate instead ties the cutoff to each row's own school year, so
*cps_sy (and every year==cps_sy-1 or year<cps_sy filter built on it) could
*differ across rows of the same investigation.
gen cps_startdate=.
forv y=2003/2017 {
	replace cps_startdate=`start`y'' if cps_year==`y'-1
}

gen cps_sy=.
la var cps_sy "SY of Investigation"
replace cps_sy=cps_year if cps_month>=1 & cps_month<=8
*When no start date is on file for that calendar year the cutoff is missing,
*which compares as larger than any day, so a September complaint stays in
*school year cps_year.
replace cps_sy=cps_year if cps_month==9 & day(cps_date)<day(cps_startdate)
replace cps_sy=cps_year+1 if cps_month==9 & day(cps_date)>=day(cps_startdate)
*NOTE(review): a missing cps_month also satisfies >=10 -- confirm cps_date is never missing
replace cps_sy=cps_year+1 if cps_month>=10
*Helper only; dropped so the set of saved variables is unchanged
drop cps_startdate

*Grade on the panel row whose school year equals the investigation school year,
*copied to every row of the child-investigation
gen cps_grade_tmp=grade_fnl if year==cps_sy
gegen cps_grade=max(cps_grade_tmp), by(ric inv_caseid)
drop cps_grade_tmp
la var cps_grade "Grade in SY of Inv. Missing= student was not in public school in SY of inv"

*****GENDER*****
rename female pre_female
label variable pre_female "Female"

*****RACE*****
**Prefix each race indicator with pre_ and build one categorical code from them.
**Codes follow the order of the list (1=white ... 6=hisp); when several
**indicators equal 1, the one latest in the list sets the category.
generate pre_racecat = .
local races white black amerin asianamer hawaiian hisp
local code = 0
foreach race of local races {
	local ++code
	rename `race' pre_`race'
	replace pre_racecat = `code' if pre_`race' == 1
}
label define r 1 "White" 2 "Black" 3 "American Indian" 4 "Asian" 5 "Hawaiian/Pacific Islander" 6 "Hispanic"
label values pre_racecat r
label variable pre_racecat "Race Category"
label variable pre_white "White"
label variable pre_black "Black"
label variable pre_amerin "American Indian"
label variable pre_asianamer "Asian"
label variable pre_hawaiian "Hawaiian/Pacific Islander"
label variable pre_hisp "Hispanic"
	
*****FREE/REDUCED PRICE LUNCH ELIGIBILITY*****
**Status in the school year immediately before the investigation school year,
**copied to every row of the child-investigation
generate tmp_poor_last = poor if year == cps_sy - 1
gegen pre_poor = max(tmp_poor_last), by(ric inv_caseid)
drop tmp_poor_last
label variable pre_poor "Poor in SY Before Investigation"

**Mean over all school years before the investigation school year. The mean is
**only filled on those pre rows, so a group max spreads it to the other rows.
gegen tmp_poor_mean = mean(poor) if year < cps_sy, by(ric inv_caseid)
gegen pre_poor_allyrs = max(tmp_poor_mean), by(ric inv_caseid)
drop tmp_poor_mean
label variable pre_poor_allyrs "Avg Poor in All Pre Years"

***********************
***(2) ACADEMIC PRE MEASURES
***********************

*****ATTENDANCE RATE, CHRONICALLY ABSENT & SPECIAL ED*****
rename frac_attend3 attend
*Chronically absent = attendance rate below 0.9. The flag is defined for every
*non-missing attendance rate; a rate of exactly 0.9 counts as not chronically
*absent (it used to fall between the <0.9 and >0.9 conditions and was left missing).
*NOTE(review): if attend is stored as float, a recorded 0.9 sits slightly below
*the double-precision 0.9 used here -- consider comparing against float(0.9).
gen chronic_abs=(attend<0.9) if !missing(attend)
rename speddummy sped

*For each measure build (a) the value in the school year just before the
*investigation school year and (b) the mean over all earlier school years,
*each copied to every row of the child-investigation.
foreach x in attend chronic_abs sped {
	gen pre_`x'_tmp=`x' if year==cps_sy-1
	gegen pre_`x'=max(pre_`x'_tmp), by(ric inv_caseid)
	drop pre_`x'_tmp

	gegen pre_`x'_allyrs_tmp=mean(`x') if year<cps_sy, by(ric inv_caseid)
	gegen pre_`x'_allyrs=max(pre_`x'_allyrs_tmp), by(ric inv_caseid)
	drop pre_`x'_allyrs_tmp
}

la var pre_attend "Attendance Rate in SY Before Investigation"
la var pre_attend_allyrs "Avg Attendance Rate in All Pre Years"
la var pre_chronic_abs "Chronically Absent Before SY of Investigation"
la var pre_chronic_abs_allyrs "Avg Chronically Absent in All Pre Years"
la var pre_sped "Special Ed in SY Before Investigation"
la var pre_sped_allyrs "Avg Special Ed in All Pre Years"

**REPEATED GRADE
**A row is flagged 1 when its grade equals the grade on the previous row and 0
**when it differs. Only rows before the investigation school year that have a
**previous row for the same child-investigation are flagged; the group max then
**records whether any such row was a repeat.
sort ric inv_caseid year
generate tmp_repeat = (grade_fnl == grade_fnl[_n-1]) ///
	if ric == ric[_n-1] & inv_caseid == inv_caseid[_n-1] & year < cps_sy
gegen pre_rep_grade = max(tmp_repeat), by(ric inv_caseid)
drop tmp_repeat
label variable pre_rep_grade "Ever repeated grade before investigation"

*****TEST SCORES & TEST SCORE PROFICIENCY*****

**Note: Since exams are taken either in the Spring or Fall depending on the year,
**count scores as happening before Investigation if they happened in the
**school year PRIOR to Investigation

**Proficiency flag: 1 when the performance level is 2 or lower, 0 for any other
**non-missing level, and missing when the level is missing
foreach subj in math reading {
	generate `subj'prof = (`subj'pl_post2011 <= 2) if `subj'pl_post2011 != .
}

**Grade 3-8 exams: for each grade and subject, taken from rows in that grade
**before the investigation school year and copied to every row of the
**child-investigation
forvalues grade = 3/8 {
	foreach subj in math reading {

		local stub `subj'_g`grade'
		local Subj = proper("`subj'")

		**took MEAP/MSTEP: 1 if a standardized score exists on a qualifying row
		generate tmp_took = (`subj'stdss != .) if grade_fnl == `grade' & year < cps_sy
		gegen pre_took_`stub' = max(tmp_took), by(ric inv_caseid)
		drop tmp_took
		label variable pre_took_`stub' "Took `Subj' MEAP/MSTEP in Grade `grade'"

		**MEAP/MSTEP score (largest one if the grade appears more than once)
		generate tmp_score = `subj'stdss if pre_took_`stub' == 1 & grade_fnl == `grade' & year < cps_sy
		gegen pre_`stub' = max(tmp_score), by(ric inv_caseid)
		drop tmp_score
		label variable pre_`stub' "Std `Subj' Score on MEAP/MSTEP in Grade `grade'"

		**Proficient on MEAP/MSTEP
		generate tmp_prof = `subj'prof if pre_took_`stub' == 1 & grade_fnl == `grade' & year < cps_sy
		gegen pre_`subj'prof_g`grade' = max(tmp_prof), by(ric inv_caseid)
		drop tmp_prof
		label variable pre_`subj'prof_g`grade' "Proficient on `Subj' MEAP/MSTEP in Grade `grade'"

	}
}

**Interaction and higher order math and reading scores
**Most recent score: the grade 8 score when present, otherwise the score from
**the highest earlier grade (7 down to 3) that has one
foreach subj in math reading {
	generate pre_`subj' = pre_`subj'_g8
	forvalues g = 7(-1)3 {
		replace pre_`subj' = pre_`subj'_g`g' if pre_`subj' == .
	}
}
label variable pre_math "Most recent math score before inv"
label variable pre_reading "Most recent reading score before inv"

**Cross product, then squares and cubes of each subject's most recent score
generate pre_math_reading = pre_math*pre_reading
foreach subj in math reading {
	generate pre_`subj'2 = pre_`subj'^2
	generate pre_`subj'3 = pre_`subj'^3
}

**ACT/SAT exams- take highest score of all retakes
*Took flag: 1 if either standardized composite exists on a grade 10-12 row
*before the investigation school year, copied to every row of the child-investigation
gen pre_took_actsat_tmp=(actcompositestd!=. | satcompositestd!=.) if inrange(grade_fnl,10,12) & year<cps_sy
gegen pre_took_actsat=max(pre_took_actsat_tmp), by(ric inv_caseid)
drop pre_took_actsat_tmp
la var pre_took_actsat "Took ACT/SAT in Pre Year"

*Best pre-investigation standardized composite on each exam
gegen pre_actstd_tmp=max(actcompositestd) if year<cps_sy, by(ric inv_caseid)
gegen pre_satstd_tmp=max(satcompositestd) if year<cps_sy, by(ric inv_caseid)
*Higher of the two exams. max() skips missing arguments, so a child with only
*one exam keeps that score. A direct "ACT < SAT" comparison must not be used
*here: a missing SAT compares as larger than any ACT score and would overwrite
*a valid ACT score with missing.
gen pre_actsatstd_tmp=max(pre_actstd_tmp, pre_satstd_tmp)
gegen pre_actsatstd=max(pre_actsatstd_tmp), by(ric inv_caseid)
la var pre_actsatstd "Standardized ACT/SAT Score in Pre Years"
drop pre_actsatstd_tmp pre_actstd* pre_satstd*

***********************
***(3) DISCIPLINE
***********************

**note: discipline records begin in 2010, so set to missing for all years where we
**don't have data available
**NOTE(review): a missing expulsion or suspension count in 2010 or later also
**evaluates as greater than zero and would be coded 1 -- confirm the counts are
**never missing in those years
generate expelled = (expulsion > 0) if year >= 2010
generate suspended = (suspension > 0) if year >= 2010

**For each indicator: the value in the school year just before the investigation
**school year, and the mean over all earlier school years, each copied to every
**row of the child-investigation
foreach d in expelled suspended {
	generate tmp_last = `d' if year == cps_sy - 1
	gegen pre_`d' = max(tmp_last), by(ric inv_caseid)
	drop tmp_last

	gegen tmp_mean = mean(`d') if year < cps_sy, by(ric inv_caseid)
	gegen pre_`d'_allyrs = max(tmp_mean), by(ric inv_caseid)
	drop tmp_mean
}

label variable pre_expelled "Expelled in SY Before Investigation"
label variable pre_expelled_allyrs "Avg Expelled in All Pre Years"
label variable pre_suspended "Suspended in SY Before Investigation"
label variable pre_suspended_allyrs "Avg Suspended in All Pre Years"

***********************
***(4) SCHOOL ENVIRONMENT
***********************

*****# DIFFERENT SCHOOLS ATTENDED*****
*Park the working panel in a temporary file so the school list can be loaded as
*the master data and the panel merged onto it by student and year.
sort ric year
tempfile old
save `old'

use "$cleandata/school_list_student_year.dta", clear
*1:m requires the school list to be unique on ric year; the panel may hold
*several rows per ric year.
merge 1:m ric year using `old'
tab _merge
*Discard school-list rows that matched no panel row; panel rows without a
*school-list match are kept.
drop if _merge==1
drop _merge

*Count taken from the row one year before the investigation and copied to every
*row of the child-investigation.
*NOTE(review): this filter uses the calendar year (cps_year-1), whereas the
*other pre measures in this file use the school year (cps_sy-1) -- confirm
*that the difference is intended.
gen pre_n_schools_tmp=n_schools_before_year if year==cps_year-1
gegen pre_n_schools=max(pre_n_schools_tmp), by(ric inv_caseid)
la var pre_n_schools "# School Transitions Before Investigation"
drop pre_n_schools_tmp

*****URBANICITY, CHARTER, SCHOOL SIZE, RACIAL COMPOSITION IN SCHOOL*****
*Locale indicators from the urbanicity code (1=urban, 2=suburban, 3 or 4=rural/town);
*each is missing when urbanicity is missing
gen sch_urban=(urbanicity==1) if urbanicity!=.
gen sch_suburb=(urbanicity==2) if urbanicity!=.
gen sch_rural_town=(urbanicity==3 | urbanicity==4) if urbanicity!=.

rename enroll sch_size 
rename per_wh sch_white
rename per_bl sch_black
rename per_hi sch_hisp
rename per_frl sch_frpl

*For each school characteristic: the value in the school year just before the
*investigation school year, and the mean over all earlier school years, each
*copied to every row of the child-investigation
foreach x in sch_urban sch_suburb sch_rural_town charter sch_size sch_white ///
	sch_black sch_hisp sch_frpl {
	gen pre_`x'_tmp=`x' if year==cps_sy-1
	gegen pre_`x'=max(pre_`x'_tmp), by(ric inv_caseid)
	drop pre_`x'_tmp
	
	gegen pre_`x'_allyrs_tmp=mean(`x') if year<cps_sy, by(ric inv_caseid)
	gegen pre_`x'_allyrs=max(pre_`x'_allyrs_tmp), by(ric inv_caseid)
	drop pre_`x'_allyrs_tmp
}

la var pre_sch_urban "Urban School in SY Before Investigation"
la var pre_sch_urban_allyrs "Avg Urban School in All Pre Years"
la var pre_sch_suburb "Suburban School in SY Before Investigation"
*The suburban average gets its own label; applying it to pre_sch_urban_allyrs
*would overwrite the urban label and leave the suburban average unlabeled
la var pre_sch_suburb_allyrs "Avg Suburban School in All Pre Years"
la var pre_sch_rural_town "Rural/Town School in SY Before Investigation"
la var pre_sch_rural_town_allyrs "Avg Rural/Town School in All Pre Years"
la var pre_charter "Charter School in SY Before Investigation"
la var pre_charter_allyrs "Avg Charter School in All Pre Years"
la var pre_sch_size "# Students in School in SY Before Investigation"
la var pre_sch_size_allyrs "Avg # Students in School in All Pre Years"
la var pre_sch_white "% White in School in SY Before Investigation"
la var pre_sch_white_allyrs "Avg % White in School in All Pre Years"
la var pre_sch_black "% Black in School in SY Before Investigation" 
la var pre_sch_black_allyrs "Avg % Black in School in All Pre Years"
la var pre_sch_hisp "% Hispanic in School in SY Before Investigation"
la var pre_sch_hisp_allyrs "Avg % Hispanic in School in All Pre Years"
la var pre_sch_frpl "% FRPL in School in SY Before Investigation" 
*Label names FRPL so it matches the variable it describes
la var pre_sch_frpl_allyrs "Avg % FRPL in School in All Pre Years"

***********************
***(5) NEIGHBORHOOD ENVIRONMENT
***********************

*Park the working panel in a temporary file so the census-block list can be
*loaded as the master data and the panel merged onto it by student and year.
sort ric year
tempfile old
save `old'

use "$cleandata/cb_list_student_year.dta", clear
*1:m requires the census-block list to be unique on ric year; the panel may
*hold several rows per ric year.
merge 1:m ric year using `old'
tab _merge
*Discard list rows that matched no panel row; panel rows without a match are kept.
drop if _merge==1
drop _merge

*****# DIFFERENT CENSUS BLOCK GROUPS LIVED IN*****
*Count taken from the row one year before the investigation and copied to every
*row of the child-investigation.
*NOTE(review): this filter uses the calendar year (cps_year-1), whereas the
*other pre measures in this file use the school year (cps_sy-1) -- confirm
*that the difference is intended.
gen pre_n_cbg_tmp=n_cb_before_year if year==cps_year-1
*Uses built-in egen under bysort (which re-sorts the data by ric inv_caseid),
*unlike the gegen calls used elsewhere in this file.
bysort ric inv_caseid: egen pre_n_cbg=max(pre_n_cbg_tmp)
la var pre_n_cbg "# Nbhd's Lived in Before Investigation"
drop pre_n_cbg_tmp

*****MEDIAN INCOME, EMPLOYMENT RATE, % BA OR HIGHER, RACIAL COMPOSITION IN NBHD, HOMELESS*****
**Give the census block group measures nbhd_ names
rename cbg_hhincome nbhd_medinc
rename cbg_employed nbhd_emp
rename cbg_white nbhd_white
rename cbg_black nbhd_black
rename cbg_hispanic nbhd_hisp
**BA or higher = sum of the BA, professional and MS education shares
generate nbhd_bapl = cbg_edu_ba + cbg_edu_pro + cbg_edu_ms

**For each measure: the value in the school year just before the investigation
**school year, and the mean over all earlier school years, each copied to every
**row of the child-investigation. Everything except median income is then
**divided by 100.
**NOTE(review): homeless is divided by 100 as well; if it is a 0/1 indicator
**rather than a percentage, the result is 0/0.01 -- confirm this is intended.
local nbhd_vars nbhd_medinc nbhd_emp nbhd_bapl nbhd_white nbhd_black nbhd_hisp homeless
foreach v of local nbhd_vars {
	generate tmp_last = `v' if year == cps_sy - 1
	gegen pre_`v' = max(tmp_last), by(ric inv_caseid)
	drop tmp_last

	gegen tmp_mean = mean(`v') if year < cps_sy, by(ric inv_caseid)
	gegen pre_`v'_allyrs = max(tmp_mean), by(ric inv_caseid)
	drop tmp_mean

	**Median income keeps its original scale
	if "`v'" == "nbhd_medinc" {
		continue
	}
	replace pre_`v' = pre_`v'/100
	replace pre_`v'_allyrs = pre_`v'_allyrs/100
}


label variable pre_nbhd_medinc "HH Median Income in CBG in SY Before Investigation"
label variable pre_nbhd_medinc_allyrs "Avg HH Median Income in CBG in All Pre Years"
label variable pre_nbhd_emp "Employment Rate in CBG in SY Before Investigation"
label variable pre_nbhd_emp_allyrs "Avg Employment Rate in CBG in All Pre Years"
label variable pre_nbhd_bapl "% BA or Higher in CBG in SY Before Investigation"
label variable pre_nbhd_bapl_allyrs "Avg % BA or Higher in CBG in All Pre Years"
label variable pre_nbhd_white "% White in CBG in SY Before Investigation"
label variable pre_nbhd_white_allyrs "Avg % White in CBG in All Pre Years"
label variable pre_nbhd_black "% Black in CBG in SY Before Investigation"
label variable pre_nbhd_black_allyrs "Avg % Black in CBG in All Pre Years"
label variable pre_nbhd_hisp "% Hispanic in CBG in SY Before Investigation"
label variable pre_nbhd_hisp_allyrs "Avg % Hispanic in CBG in All Pre Years"
label variable pre_homeless "Homeless in SY Before Investigation"
label variable pre_homeless_allyrs "Avg Homeless in All Pre Years"

*Shrink storage types where possible, order the rows by child, investigation
*and year, and write the panel with the pre variables (replacing any existing
*copy of the file).
compress
sort ric inv_caseid year
save "$cleandata/student_year_panel_withpre.dta", replace

























